Prerequisites
Load required packages
library(tidyverse)
library(ggplot2)
library(rtweet)
library(readr)
Dataset
Import processed data, which can be found here.
#read preprocessed data
wines <- read.csv(file = '../data/processed_data/wines.csv')
Get sample of dataset
#set seed value to birthday of Ricardo Rodriguez, American wrestler and ring announcer and Dr. Reinaldo (Rei) Sanchez-Arias
set.seed(19630217)
#set percentage to test with for simplicity, if needed
percentage <- 5
wine_sample<- sample_n(wines, percentage/100*nrow(wines))
Split Taster data into different Data Frame
tasters <- wines %>%
select(taster_name, taster_twitter_handle) %>% unique()
tasters
Drop taster_twitter_handle in wines dataframe
wines <- wines %>%
select(-taster_twitter_handle)
head(wines)
Add Reviewer profile info
Each reviewer has there own bias. To offset that we made a “profile” for each reviewer which includes characteristics like: avg_points, sd_points, and var_points
taster_rating_profile <- wines %>%
group_by(taster_name) %>%
summarize(
avg_points = mean(points),
sd_points = sd(points),
var_points = var(points)
)
tasters <- inner_join(tasters, taster_rating_profile, by = "taster_name")
head(tasters)
Add Rating Classification
Add following classification to wine dataset as found on the website:
| Classic |
98-100 |
The pinnacle of quality. |
| Superb |
94-97 |
A great achievement. |
| Excellent |
90-93 |
Highly recommended. |
| Very Good |
87-89 |
Often good value; well recommended. |
| Good |
83-86 |
Suitable for everyday consumption; often good value. |
| Acceptable |
80-82 |
Can be employed in casual, less-critical circumstances |
# function to add rating
rating_category <- function(points){
if(points>=98){
return("Classic")
}
else if (points>=94){
return("Superb")
}
else if(points>=90){
return("Excellent")
}
else if(points>=87){
return("Very Good")
}
else if(points>=83){
return("Good")
}
else{
return("Acceptable")
}
}
wines<- wines %>%
rowwise() %>%
mutate(rating_category = rating_category(points))
head(wines)
Add Adjusted Points
Since, each reviewer has a different bias we created a normalized metric, norm_points, by looking at the number of standard deviatioins a wine is from the reviewer’s avg_points. This gives use a more accurate representation of which which wines are better than the rest.
normalize_points <- function(points, taster_name){
t <- tasters %>%
filter(taster_name == taster_name)
nrow(t)
#return((points-t$avg_points)/t$sd_points)
}
wine_sample %>%
rowwise() %>%
mutate(norm_points = normalize_points(points, taster_name))
Data Exploration
Univariate Exploration
Correlation price by points, using DataExplorer library which can be found here
# TODO: IZZY
# TODO: OASKI (This is not producing correct results)
wines %>%
summarize(avg_price = mean(price, na.rm=TRUE),
sd_price = sd(price, na.rm=TRUE),
lowest_price = min(price, na.rm=TRUE),
highest_price = max(price,na.rm=TRUE))
# TODO: OASKI (This is not producing correct results)
wines %>%
summarize(avg_points = mean(points, na.rm=TRUE),
sd_points = sd(points, na.rm=TRUE),
lowest_points = min(points, na.rm=TRUE),
highest_points = max(points,na.rm=TRUE))
Price by Points
Notice the data is “stacked” and the socres range from 80-100
wines %>%
ggplot() +
geom_point(mapping = (aes(x = points, y = price)), na.rm = T, alpha = 0.15) +
labs(title = "Price by Points", x = "Points", y = "Price")
TODO: IZZY (Why did we log this?)
wines %>%
ggplot() +
geom_point(mapping = (aes(x = points, y = log(price))), na.rm = T, alpha = 0.15) +
labs(title = "log(Price) by Points", x = "Points", y = "log(Price)")
Data Analysis
#Find the best province for wine using the average points across the 1,000 samples #drop the descriptions or just select price? set points to max(points)
best_province <- wine_sample %>%
group_by(province, points) %>%
filter(points > 88.669)
best_province
Rating distribution
Best wine, by variety
#wine_best_variety <-
wines %>%
group_by(variety) %>%
summarise(mean_points = mean(points)) %>%
arrange(desc(mean_points))
user_price <- readline(prompt = "How much are you willing to spend on a bottle?")
user_price <- as.integer(user_price)
wines %>%
filter(price <= user_price) %>%
arrange(desc(points)) %>%
select(title, price, points)
Conclusion
LS0tCnRpdGxlOiAiRXhwbG9yaW5nIGFuZCBBbmFseWl6aW5nIFdpbmUgRW50aHVzaWFzdCBSZXZpZXdzIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIFByZXJlcXVpc2l0ZXMKCkxvYWQgcmVxdWlyZWQgcGFja2FnZXMKYGBge3IsIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KGRwbHlyKQpsaWJyYXJ5KGdncGxvdDIpCmxpYnJhcnkocnR3ZWV0KQpsaWJyYXJ5KHJlYWRyKQpgYGAKCiMgRGF0YXNldAoKSW1wb3J0IHByb2Nlc3NlZCBkYXRhLCB3aGljaCBjYW4gYmUgZm91bmQgW2hlcmVdKGh0dHBzOi8vZ2l0aHViLmNvbS9DNHJieW4zbTRuL3dpbmVfcmV2aWV3c19kYXRhX2FuYWx5c2lzL2Jsb2IvbWFzdGVyL2RhdGEvcHJvY2Vzc2VkX2RhdGEvcHJlcHJvY2Vzc2luZy5ybWQpLgoKYGBge3J9CiNyZWFkIHByZXByb2Nlc3NlZCBkYXRhCndpbmVzIDwtIHJlYWQuY3N2KGZpbGUgPSAnLi4vZGF0YS9wcm9jZXNzZWRfZGF0YS93aW5lcy5jc3YnKQpgYGAKCkdldCBzYW1wbGUgb2YgZGF0YXNldApgYGB7cn0KI3NldCBzZWVkIHZhbHVlIHRvIGJpcnRoZGF5IG9mIFJpY2FyZG8gUm9kcmlndWV6LCBBbWVyaWNhbiB3cmVzdGxlciBhbmQgcmluZyBhbm5vdW5jZXIgYW5kIERyLiBSZWluYWxkbyAoUmVpKSBTYW5jaGV6LUFyaWFzCnNldC5zZWVkKDE5NjMwMjE3KQoKI3NldCBwZXJjZW50YWdlIHRvIHRlc3Qgd2l0aCBmb3Igc2ltcGxpY2l0eSwgaWYgbmVlZGVkCnBlcmNlbnRhZ2UgPC0gNQp3aW5lX3NhbXBsZTwtIHNhbXBsZV9uKHdpbmVzLCBwZXJjZW50YWdlLzEwMCpucm93KHdpbmVzKSkKYGBgCgojIyMgU3BsaXQgVGFzdGVyIGRhdGEgaW50byBkaWZmZXJlbnQgRGF0YSBGcmFtZQoKYGBge3J9CnRhc3RlcnMgPC0gd2luZXMgJT4lCiAgc2VsZWN0KHRhc3Rlcl9uYW1lLCB0YXN0ZXJfdHdpdHRlcl9oYW5kbGUpICU+JSB1bmlxdWUoKQp0YXN0ZXJzCmBgYAoKRHJvcCBgdGFzdGVyX3R3aXR0ZXJfaGFuZGxlYCBpbiB3aW5lcyBkYXRhZnJhbWUKCmBgYHtyfQp3aW5lcyA8LSB3aW5lcyAlPiUKICBzZWxlY3QoLXRhc3Rlcl90d2l0dGVyX2hhbmRsZSkKaGVhZCh3aW5lcykKYGBgCiMjIEFkZCBSZXZpZXdlciBwcm9maWxlIGluZm8KCkVhY2ggcmV2aWV3ZXIgaGFzIHRoZXJlIG93biBiaWFzLiBUbyBvZmZzZXQgdGhhdCB3ZSBtYWRlIGEgInByb2ZpbGUiIGZvciBlYWNoIHJldmlld2VyIHdoaWNoIGluY2x1ZGVzIGNoYXJhY3RlcmlzdGljcyBsaWtlOiBgYXZnX3BvaW50c2AsIGBzZF9wb2ludHNgLCBhbmQgYHZhcl9wb2ludHNgCmBgYHtyfQp0YXN0ZXJfcmF0aW5nX3Byb2ZpbGUgPC0gd2luZXMgJT4lCiAgZ3JvdXBfYnkodGFzdGVyX25hbWUpICU+JQogIHN1bW1hcml6ZSgKICAgIGF2Z19wb2ludHMgPSBtZWFuKHBvaW50cyksCiAgICBzZF9wb2ludHMgPSBzZChwb2ludHMpLAogICAgdmFyX3BvaW50cyA9IHZhcihwb2ludHMpCiAgKQoKdGFzdGVycyA8LSBpbm5lcl9qb2luKHRhc3RlcnMsIHRhc3Rlcl9yYXRpbmdfcHJvZmlsZSwgYnkgPSAidGFzdGVyX25hbWUiKQpoZWFkKHRhc3RlcnMpCmBgYAojIyMgQWRkIFJhdGluZyBDbGFzc2lmaWNhdGlvbgoKQWRkIGZvbGxvd2luZyBjbGFzc2lmaWNhdGlvbiB0byB3aW5lIGRhdGFzZXQgYXMgZm91bmQgb24gdGhlIFt3ZWJzaXRlXShodHRwczovL3d3dy53aW5lbWFnLmNvbS8yMDEwLzA0LzA5L3lvdS1hc2tlZC1ob3ctaXMtYS13aW5lcy1zY29yZS1kZXRlcm1pbmVkLyk6Cgp8Q2F0ZWdvcnkgIHwgUmF0aW5nICB8IERlc2NyaXB0aW9uICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICB8CnwtLS0tLS0tLS0tfC0tLS0tLS0tLXwtLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLXwKfENsYXNzaWMgICB8CTk4LTEwMCB8IFRoZSBwaW5uYWNsZSBvZiBxdWFsaXR5LiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICB8CnxTdXBlcmIgICAgfAk5NC05NwkgfCBBIGdyZWF0IGFjaGlldmVtZW50LiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgfAp8RXhjZWxsZW50IHwJOTAtOTMJIHwgSGlnaGx5IHJlY29tbWVuZGVkLiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHwKfFZlcnkgR29vZCB8ICA4Ny04OQkgfCBPZnRlbiBnb29kIHZhbHVlOyB3ZWxsIHJlY29tbWVuZGVkLiAgICAgICAgICAgICAgICAgICAgfAp8R29vZAkgICAgIHwgIDgzLTg2CSB8IFN1aXRhYmxlIGZvciBldmVyeWRheSBjb25zdW1wdGlvbjsgb2Z0ZW4gZ29vZCB2YWx1ZS4gICB8CnxBY2NlcHRhYmxlfAk4MC04MgkgfCBDYW4gYmUgZW1wbG95ZWQgaW4gY2FzdWFsLCBsZXNzLWNyaXRpY2FsIGNpcmN1bXN0YW5jZXMgfAoKYGBge3J9CiMgZnVuY3Rpb24gdG8gYWRkIHJhdGluZwpyYXRpbmdfY2F0ZWdvcnkgPC0gZnVuY3Rpb24ocG9pbnRzKXsKICBpZihwb2ludHM+PTk4KXsKICAgIHJldHVybigiQ2xhc3NpYyIpCiAgfQogIGVsc2UgaWYgKHBvaW50cz49OTQpewogICAgcmV0dXJuKCJTdXBlcmIiKQogIH0KICBlbHNlIGlmKHBvaW50cz49OTApewogICAgcmV0dXJuKCJFeGNlbGxlbnQiKQogIH0KICBlbHNlIGlmKHBvaW50cz49ODcpewogICAgcmV0dXJuKCJWZXJ5IEdvb2QiKQogIH0KICBlbHNlIGlmKHBvaW50cz49ODMpewogICAgcmV0dXJuKCJHb29kIikKICB9CiAgZWxzZXsKICAgIHJldHVybigiQWNjZXB0YWJsZSIpCiAgfQp9Cgp3aW5lczwtIHdpbmVzICU+JQogIHJvd3dpc2UoKSAlPiUKICBtdXRhdGUocmF0aW5nX2NhdGVnb3J5ID0gcmF0aW5nX2NhdGVnb3J5KHBvaW50cykpCmhlYWQod2luZXMpCmBgYAoKIyMgQWRkIEFkanVzdGVkIFBvaW50cwoKU2luY2UsIGVhY2ggcmV2aWV3ZXIgaGFzIGEgZGlmZmVyZW50IGJpYXMgd2UgY3JlYXRlZCBhIG5vcm1hbGl6ZWQgbWV0cmljLCBgbm9ybV9wb2ludHNgLCBieSBsb29raW5nIGF0IHRoZSBudW1iZXIgb2Ygc3RhbmRhcmQgZGV2aWF0aW9pbnMgYSB3aW5lIGlzIGZyb20gdGhlIHJldmlld2VyJ3MgYGF2Z19wb2ludHNgLiBUaGlzIGdpdmVzIHVzZSBhIG1vcmUgYWNjdXJhdGUgcmVwcmVzZW50YXRpb24gb2Ygd2hpY2ggd2hpY2ggd2luZXMgYXJlIGJldHRlciB0aGFuIHRoZSByZXN0LgoKYGBge3J9Cm5vcm1hbGl6ZV9wb2ludHMgPC0gZnVuY3Rpb24ocG9pbnRzLCBuYW1lKXsKICB0IDwtIHRhc3RlcnMgJT4lCiAgICBmaWx0ZXIodGFzdGVyX25hbWUgPT0gbmFtZSkKICBucm93KHQpCiAgI3JldHVybigocG9pbnRzLXQkYXZnX3BvaW50cykvdCRzZF9wb2ludHMpCn0KCndpbmVfc2FtcGxlICU+JQogIHJvd3dpc2UoKSAlPiUKICBtdXRhdGUobm9ybV9wb2ludHMgPSBub3JtYWxpemVfcG9pbnRzKHBvaW50cywgdGFzdGVyX25hbWUpKQpgYGAKCiMgRGF0YSBFeHBsb3JhdGlvbgoKIyMgVW5pdmFyaWF0ZSBFeHBsb3JhdGlvbgpDb3JyZWxhdGlvbiBgcHJpY2VgIGJ5IGBwb2ludHNgLCB1c2luZyBgYGBEYXRhRXhwbG9yZXJgYGAgbGlicmFyeSB3aGljaCBjYW4gYmUgZm91bmQgW2hlcmVdKGh0dHBzOi8vZGF0YXNjaWVuY2VwbHVzLmNvbS9ibGF6aW5nLWZhc3QtZWRhLWluLXItd2l0aC1kYXRhZXhwbG9yZXIvKQpgYGB7cn0KIyBUT0RPOiBJWlpZCmBgYAoKYGBge3J9CiMgVE9ETzogT0FTS0kgKFRoaXMgaXMgbm90IHByb2R1Y2luZyBjb3JyZWN0IHJlc3VsdHMpCndpbmVzICU+JQogICAgc3VtbWFyaXplKGF2Z19wcmljZSA9IG1lYW4ocHJpY2UsIG5hLnJtPVRSVUUpLCAKICAgICAgICAgICAgICBzZF9wcmljZSA9IHNkKHByaWNlLCBuYS5ybT1UUlVFKSwKICAgICAgICAgICAgICBsb3dlc3RfcHJpY2UgPSBtaW4ocHJpY2UsIG5hLnJtPVRSVUUpLAogICAgICAgICAgICAgIGhpZ2hlc3RfcHJpY2UgPSBtYXgocHJpY2UsbmEucm09VFJVRSkpCmBgYAoKYGBge3J9CiMgVE9ETzogT0FTS0kgKFRoaXMgaXMgbm90IHByb2R1Y2luZyBjb3JyZWN0IHJlc3VsdHMpCndpbmVzICU+JQogICAgc3VtbWFyaXplKGF2Z19wb2ludHMgPSBtZWFuKHBvaW50cywgbmEucm09VFJVRSksIAogICAgICAgICAgICAgIHNkX3BvaW50cyA9IHNkKHBvaW50cywgbmEucm09VFJVRSksCiAgICAgICAgICAgICAgbG93ZXN0X3BvaW50cyA9IG1pbihwb2ludHMsIG5hLnJtPVRSVUUpLAogICAgICAgICAgICAgIGhpZ2hlc3RfcG9pbnRzID0gbWF4KHBvaW50cyxuYS5ybT1UUlVFKSkKYGBgCgojIyBQcmljZSBieSBQb2ludHMKTm90aWNlIHRoZSBkYXRhIGlzICJzdGFja2VkIiBhbmQgdGhlIHNvY3JlcyByYW5nZSBmcm9tIDgwLTEwMApgYGB7cn0Kd2luZXMgJT4lIAogIGdncGxvdCgpICsKICBnZW9tX3BvaW50KG1hcHBpbmcgPSAoYWVzKHggPSBwb2ludHMsIHkgPSBwcmljZSkpLCBuYS5ybSA9IFQsIGFscGhhID0gMC4xNSkgKwogIGxhYnModGl0bGUgPSAiUHJpY2UgYnkgUG9pbnRzIiwgeCA9ICJQb2ludHMiLCB5ID0gIlByaWNlIikKYGBgCgpUT0RPOiBJWlpZIChXaHkgZGlkIHdlIGxvZyB0aGlzPykKCmBgYHtyfQp3aW5lcyAlPiUgCiAgZ2dwbG90KCkgKwogIGdlb21fcG9pbnQobWFwcGluZyA9IChhZXMoeCA9IHBvaW50cywgeSA9IGxvZyhwcmljZSkpKSwgbmEucm0gPSBULCBhbHBoYSA9IDAuMTUpICsKICBsYWJzKHRpdGxlID0gImxvZyhQcmljZSkgYnkgUG9pbnRzIiwgeCA9ICJQb2ludHMiLCB5ID0gImxvZyhQcmljZSkiKQpgYGAKCiMgRGF0YSBBbmFseXNpcwoKI0ZpbmQgdGhlIGJlc3QgcHJvdmluY2UgZm9yIHdpbmUgdXNpbmcgdGhlIGF2ZXJhZ2UgcG9pbnRzIGFjcm9zcyB0aGUgMSwwMDAgc2FtcGxlcwojZHJvcCB0aGUgZGVzY3JpcHRpb25zIG9yIGp1c3Qgc2VsZWN0IHByaWNlPyBzZXQgcG9pbnRzIHRvIG1heChwb2ludHMpCmBgYHtyfQpiZXN0X3Byb3ZpbmNlIDwtIHdpbmVfc2FtcGxlICU+JSAKICBncm91cF9ieShwcm92aW5jZSwgcG9pbnRzKSAlPiUgCiAgZmlsdGVyKHBvaW50cyA+IDg4LjY2OSkKYmVzdF9wcm92aW5jZSAgCmBgYAoKClJhdGluZyBkaXN0cmlidXRpb24KCmBgYHtyfQoKYGBgCgpCZXN0IHdpbmUsIGJ5IHZhcmlldHkKYGBge3J9CiN3aW5lX2Jlc3RfdmFyaWV0eSA8LSAKd2luZXMgJT4lIAogIGdyb3VwX2J5KHZhcmlldHkpICU+JSAKICBzdW1tYXJpc2UobWVhbl9wb2ludHMgPSBtZWFuKHBvaW50cykpICU+JSAKICBhcnJhbmdlKGRlc2MobWVhbl9wb2ludHMpKSAKICAKYGBgCgpgYGB7cn0KdXNlcl9wcmljZSA8LSByZWFkbGluZShwcm9tcHQgPSAiSG93IG11Y2ggYXJlIHlvdSB3aWxsaW5nIHRvIHNwZW5kIG9uIGEgYm90dGxlPyIpCnVzZXJfcHJpY2UgPC0gYXMuaW50ZWdlcih1c2VyX3ByaWNlKQoKd2luZXMgJT4lIAogIGZpbHRlcihwcmljZSA8PSB1c2VyX3ByaWNlKSAlPiUgCiAgYXJyYW5nZShkZXNjKHBvaW50cykpICU+JSAKICBzZWxlY3QodGl0bGUsIHByaWNlLCBwb2ludHMpCmBgYAoKCiMgQ29uY2x1c2lvbgo=